# NOTE(review): `rm(list = ls())` was removed — clearing the global
# environment inside a script is an anti-pattern; restart R for a clean
# session instead.
# Register a parallel backend so tune_grid() can run resamples in parallel.
doParallel::registerDoParallel()
library(tidyverse)
library(tidymodels)
library(readr)
library(tswge)
library(ggplot2)
library(yardstick)
library(workflows)
library(parsnip)
library(tidyclust)
library(RcppHungarian)
# Autocorrelation function of a time series as a data frame.
#
# @param vec A numeric vector (the time series).
# @return A data.frame with columns `lag` and `acf`.
acfdf <- function(vec) {
  # FALSE, not the reassignable alias `F`
  vacf <- acf(vec, plot = FALSE)
  with(vacf, data.frame(lag, acf))
}
# Stick (segment) plot of the autocorrelation function of `vec`.
ggacf <- function(vec) {
  acf_data <- acfdf(vec)
  ggplot(data = acf_data, aes(x = lag, y = acf)) +
    geom_hline(aes(yintercept = 0)) +
    geom_segment(mapping = aes(xend = lag, yend = 0))
}
# Line plot of a time series: values of `vec` against their time index.
tplot <- function(vec) {
  ts_df <- data.frame(X = vec, t = seq_along(vec))
  ggplot(data = ts_df, aes(x = t, y = X)) + geom_line()
}
Biostat 212a Homework 6
Due Mar 22, 2024 @ 11:59PM
Load R libraries.
1 New York Stock Exchange (NYSE) data (1962-1986) (140 pts)
The NYSE.csv file contains three daily time series from the New York Stock Exchange (NYSE) for the period Dec 3, 1962-Dec 31, 1986 (6,051 trading days).
Log trading volume (\(v_t\)): This is the fraction of all outstanding shares that are traded on that day, relative to a 100-day moving average of past turnover, on the log scale. Dow Jones return (\(r_t\)): This is the difference between the log of the Dow Jones Industrial Index on consecutive trading days. Log volatility (\(z_t\)): This is based on the absolute values of daily price movements.
# Read in NYSE data from url
url = "https://raw.githubusercontent.com/ucla-biostat-212a/2024winter/master/slides/data/NYSE.csv"
NYSE <- read_csv(url)
NYSE# A tibble: 6,051 × 6
date day_of_week DJ_return log_volume log_volatility train
<date> <chr> <dbl> <dbl> <dbl> <lgl>
1 1962-12-03 mon -0.00446 0.0326 -13.1 TRUE
2 1962-12-04 tues 0.00781 0.346 -11.7 TRUE
3 1962-12-05 wed 0.00384 0.525 -11.7 TRUE
4 1962-12-06 thur -0.00346 0.210 -11.6 TRUE
5 1962-12-07 fri 0.000568 0.0442 -11.7 TRUE
6 1962-12-10 mon -0.0108 0.133 -10.9 TRUE
7 1962-12-11 tues 0.000124 -0.0115 -11.0 TRUE
8 1962-12-12 wed 0.00336 0.00161 -11.0 TRUE
9 1962-12-13 thur -0.00330 -0.106 -11.0 TRUE
10 1962-12-14 fri 0.00447 -0.138 -11.0 TRUE
# ℹ 6,041 more rows
The autocorrelation at lag \(\ell\) is the correlation of all pairs \((v_t, v_{t-\ell})\) that are \(\ell\) trading days apart. These sizable correlations give us confidence that past values will be helpful in predicting the future.
Code
ggacf(NYSE$log_volume) + ggthemes::theme_few()Do a similar plot for (1) the correlation between \(v_t\) and lag \(\ell\) Dow Jones return \(r_{t-\ell}\) and (2) correlation between \(v_t\) and lag \(\ell\) Log volatility \(z_{t-\ell}\).
Code
seq(1, 30) %>%
map(function(x) {cor(NYSE$log_volume , lag(NYSE$DJ_return, x), use = "pairwise.complete.obs")}) %>%
unlist() %>%
tibble(lag = 1:30, cor = .) %>%
ggplot(aes(x = lag, y = cor)) +
geom_hline(aes(yintercept = 0)) +
geom_segment(mapping = aes(xend = lag, yend = 0)) +
ggtitle("AutoCorrelation between `log volume` and lagged `DJ return`")Code
seq(1, 30) %>%
map(function(x) {cor(NYSE$log_volume , lag(NYSE$log_volatility, x), use = "pairwise.complete.obs")}) %>%
unlist() %>%
tibble(lag = 1:30, cor = .) %>%
ggplot(aes(x = lag, y = cor)) +
geom_hline(aes(yintercept = 0)) +
geom_segment(mapping = aes(xend = lag, yend = 0)) +
ggtitle("AutoCorrelation between `log volume` and lagged `log volatility`")1.1 Project goal
Our goal is to forecast daily Log trading volume, using various machine learning algorithms we learnt in this class.
The data set is already split into train (before Jan 1st, 1980, \(n_{\text{train}} = 4,281\)) and test (after Jan 1st, 1980, \(n_{\text{test}} = 1,770\)) sets.
In general, we will tune the lag \(L\) to achieve the best forecasting performance. In this project, we fix \(L=5\). That is, we always use the previous five trading days’ data to forecast today’s log trading volume.
Pay attention to the nuance of splitting time series data for cross validation. Study and use the time-series functionality in tidymodels. Make sure to use the same splits when tuning different machine learning algorithms.
Use the \(R^2\) between forecast and actual values as the cross validation and test evaluation criterion.
1.2 Baseline method (20 pts)
We use the straw man (use yesterday’s value of log trading volume to predict that of today) as the baseline method. Evaluate the \(R^2\) of this method on the test data.
Before we tune different machine learning methods, let’s first separate into the test and non-test sets. We drop the first 5 trading days which lack some lagged variables.
# Lag: look back L trading days.
# These lagged columns are the AR(5) features used by all models below.
# Inside mutate(), refer to columns by bare name (not NYSE$col) so each lag
# is computed from the data frame being mutated, not a stale global copy.
L <- 5
for (i in seq_len(L)) {
  NYSE <- NYSE %>%
    mutate(!!paste0("DJ_return_lag", i) := lag(DJ_return, i),
           !!paste0("log_volume_lag", i) := lag(log_volume, i),
           !!paste0("log_volatility_lag", i) := lag(log_volatility, i))
}
NYSE <- NYSE %>% na.omit() %>%
print(width = Inf)# A tibble: 6,046 × 21
date day_of_week DJ_return log_volume log_volatility train
<date> <chr> <dbl> <dbl> <dbl> <lgl>
1 1962-12-10 mon -0.0108 0.133 -10.9 TRUE
2 1962-12-11 tues 0.000124 -0.0115 -11.0 TRUE
3 1962-12-12 wed 0.00336 0.00161 -11.0 TRUE
4 1962-12-13 thur -0.00330 -0.106 -11.0 TRUE
5 1962-12-14 fri 0.00447 -0.138 -11.0 TRUE
6 1962-12-17 mon -0.00402 -0.0496 -11.0 TRUE
7 1962-12-18 tues -0.00832 -0.0434 -10.7 TRUE
8 1962-12-19 wed 0.0107 0.0536 -10.4 TRUE
9 1962-12-20 thur 0.00239 0.105 -10.5 TRUE
10 1962-12-21 fri -0.00330 -0.0890 -10.5 TRUE
DJ_return_lag1 log_volume_lag1 log_volatility_lag1 DJ_return_lag2
<dbl> <dbl> <dbl> <dbl>
1 0.000568 0.0442 -11.7 -0.00346
2 -0.0108 0.133 -10.9 0.000568
3 0.000124 -0.0115 -11.0 -0.0108
4 0.00336 0.00161 -11.0 0.000124
5 -0.00330 -0.106 -11.0 0.00336
6 0.00447 -0.138 -11.0 -0.00330
7 -0.00402 -0.0496 -11.0 0.00447
8 -0.00832 -0.0434 -10.7 -0.00402
9 0.0107 0.0536 -10.4 -0.00832
10 0.00239 0.105 -10.5 0.0107
log_volume_lag2 log_volatility_lag2 DJ_return_lag3 log_volume_lag3
<dbl> <dbl> <dbl> <dbl>
1 0.210 -11.6 0.00384 0.525
2 0.0442 -11.7 -0.00346 0.210
3 0.133 -10.9 0.000568 0.0442
4 -0.0115 -11.0 -0.0108 0.133
5 0.00161 -11.0 0.000124 -0.0115
6 -0.106 -11.0 0.00336 0.00161
7 -0.138 -11.0 -0.00330 -0.106
8 -0.0496 -11.0 0.00447 -0.138
9 -0.0434 -10.7 -0.00402 -0.0496
10 0.0536 -10.4 -0.00832 -0.0434
log_volatility_lag3 DJ_return_lag4 log_volume_lag4 log_volatility_lag4
<dbl> <dbl> <dbl> <dbl>
1 -11.7 0.00781 0.346 -11.7
2 -11.6 0.00384 0.525 -11.7
3 -11.7 -0.00346 0.210 -11.6
4 -10.9 0.000568 0.0442 -11.7
5 -11.0 -0.0108 0.133 -10.9
6 -11.0 0.000124 -0.0115 -11.0
7 -11.0 0.00336 0.00161 -11.0
8 -11.0 -0.00330 -0.106 -11.0
9 -11.0 0.00447 -0.138 -11.0
10 -10.7 -0.00402 -0.0496 -11.0
DJ_return_lag5 log_volume_lag5 log_volatility_lag5
<dbl> <dbl> <dbl>
1 -0.00446 0.0326 -13.1
2 0.00781 0.346 -11.7
3 0.00384 0.525 -11.7
4 -0.00346 0.210 -11.6
5 0.000568 0.0442 -11.7
6 -0.0108 0.133 -10.9
7 0.000124 -0.0115 -11.0
8 0.00336 0.00161 -11.0
9 -0.00330 -0.106 -11.0
10 0.00447 -0.138 -11.0
# ℹ 6,036 more rows
# Drop beginning trading days which lack some lagged variables
# Non-test (training) rows. `train` is a logical column, so filter on it
# directly instead of comparing to the string 'TRUE'.
NYSE_other <- NYSE %>%
  filter(train) %>%
  select(-train) %>%
  drop_na()
dim(NYSE_other)[1] 4276 20
# Test rows (after Jan 1, 1980). Negate the logical flag directly instead
# of comparing to the string 'FALSE'; use `<-` for assignment.
NYSE_test <- NYSE %>%
  filter(!train) %>%
  select(-train) %>%
  drop_na()
dim(NYSE_test)[1] 1770 20
library(yardstick)
# cor(NYSE_test$log_volume, NYSE_test$log_volume_lag1) %>% round(2)
r2_test_strawman = rsq_vec(NYSE_test$log_volume, lag(NYSE_test$log_volume, 1)) %>% round(2)
print(paste("Straw man test R2: ", r2_test_strawman))[1] "Straw man test R2: 0.35"
1.3 Autoregression (AR) forecaster (30 pts)
Let \[ y = \begin{pmatrix} v_{L+1} \\ v_{L+2} \\ v_{L+3} \\ \vdots \\ v_T \end{pmatrix}, \quad M = \begin{pmatrix} 1 & v_L & v_{L-1} & \cdots & v_1 \\ 1 & v_{L+1} & v_{L} & \cdots & v_2 \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ 1 & v_{T-1} & v_{T-2} & \cdots & v_{T-L} \end{pmatrix}. \]
Fit an ordinary least squares (OLS) regression of \(y\) on \(M\), giving \[ \hat v_t = \hat \beta_0 + \hat \beta_1 v_{t-1} + \hat \beta_2 v_{t-2} + \cdots + \hat \beta_L v_{t-L}, \] known as an order-\(L\) autoregression model or AR(\(L\)).
Tune AR(5) with elastic net (lasso + ridge) regularization using all 3 features on the training data, and evaluate the test performance.
Hint: Workflow: Lasso is a good starting point.
Before we start the model training, let’s talk about time series resampling. We will use the
rolling_origin function in the rsample package to create a time series cross-validation plan. When the data have a strong time component, a resampling method should support modeling to estimate seasonal and other temporal trends within the data. A technique that randomly samples values from the training set can disrupt the model’s ability to estimate these patterns.
NYSE %>%
ggplot(aes(x = date, y = log_volume)) +
geom_line() +
geom_smooth(method = "lm")wrong_split <- initial_split(NYSE_other)
bind_rows(
training(wrong_split) %>% mutate(type = "train"),
testing(wrong_split) %>% mutate(type = "test")
) %>%
ggplot(aes(x = date, y = log_volume, color = type, group = NA)) +
geom_line()correct_split <- initial_time_split(NYSE_other %>% arrange(date))
bind_rows(
training(correct_split) %>% mutate(type = "train"),
testing(correct_split) %>% mutate(type = "test")
) %>%
ggplot(aes(x = date, y = log_volume, color = type, group = NA)) +
geom_line()rolling_origin(NYSE_other %>% arrange(date), initial = 30, assess = 7) %>%
#sliding_period(NYSE_other %>% arrange(date), date, period = "day", lookback = Inf, assess_stop = 1) %>%
mutate(train_data = map(splits, analysis),
test_data = map(splits, assessment)) %>%
select(-splits) %>%
pivot_longer(-id) %>%
filter(id %in% c("Slice0001", "Slice0002", "Slice0003")) %>%
unnest(value) %>%
ggplot(aes(x = date, y = log_volume, color = name, group = NA)) +
geom_point() +
geom_line() +
facet_wrap(~id, scales = "fixed")sliding_period(NYSE_other %>% arrange(date),
date, period = "month", lookback = Inf, assess_stop = 1) %>%
mutate(train_data = map(splits, analysis),
test_data = map(splits, assessment)) %>%
select(-splits) %>%
pivot_longer(-id) %>%
filter(id %in% c("Slice001", "Slice002", "Slice003")) %>%
unnest(value) %>%
ggplot(aes(x = date, y = log_volume, color = name, group = NA)) +
geom_point() +
geom_line() +
facet_wrap(~id, scales = "fixed")Rolling forecast origin resampling (Hyndman and Athanasopoulos 2018) provides a method that emulates how time series data is often partitioned in practice, estimating the model with historical data and evaluating it with the most recent data.
Tune AR(5) with elastic net (lasso + ridge) regularization using all 3 features on the training data, and evaluate the test performance.
1.4 Preprocessing
# Preprocessing recipe for the elastic-net AR(5) model: dummy-code the
# nominal day_of_week, normalize numeric predictors, and drop rows with
# missing predictors.
en_receipe <-
  recipe(log_volume ~ ., data = NYSE_other) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
  step_naomit(all_predictors()) %>%
  # prep()'s argument is `training`, not `data` (which was silently ignored)
  prep(training = NYSE_other)
en_receipe1.5 Model training
### Model
# Elastic-net linear regression via glmnet; penalty (lambda) tuned by CV.
enet_mod <-
# mixture = 0 (ridge), mixture = 1 (lasso)
# mixture = (0, 1) elastic net
# As an example, we set mixture = 0.5. It needs to be tuned.
linear_reg(penalty = tune(), mixture = 0.5) %>%
set_engine("glmnet")
enet_modLinear Regression Model Specification (regression)
Main Arguments:
penalty = tune()
mixture = 0.5
Computational engine: glmnet
en_wf <-
workflow() %>%
add_model(enet_mod) %>%
add_recipe(en_receipe %>% step_rm(date) %>% step_indicate_na())
en_wf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
5 Recipe Steps
• step_dummy()
• step_normalize()
• step_naomit()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)
Main Arguments:
penalty = tune()
mixture = 0.5
Computational engine: glmnet
folds <- NYSE_other %>% arrange(date) %>%
sliding_period(date, period = "month", lookback = Inf, assess_stop = 1)
# rolling_origin(initial = 5, assess = 1)
month_folds <- NYSE_other %>%
sliding_period(
date,
"month",
lookback = Inf,
skip = 4)en_grid <-
grid_regular(penalty(range = c(-8, -7), trans = log10_trans()), levels = 3)
en_grid# A tibble: 3 × 1
penalty
<dbl>
1 0.00000001
2 0.0000000316
3 0.0000001
library(randomForest)
en_fit <- tune_grid(en_wf, resamples = month_folds, grid = en_grid)
en_fit# Tuning results
# Sliding period resampling
# A tibble: 200 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [98/22]> Slice001 <tibble [6 × 5]> <tibble [0 × 3]>
2 <split [120/20]> Slice002 <tibble [6 × 5]> <tibble [0 × 3]>
3 <split [140/22]> Slice003 <tibble [6 × 5]> <tibble [0 × 3]>
4 <split [162/22]> Slice004 <tibble [6 × 5]> <tibble [0 × 3]>
5 <split [184/20]> Slice005 <tibble [6 × 5]> <tibble [0 × 3]>
6 <split [204/23]> Slice006 <tibble [6 × 5]> <tibble [0 × 3]>
7 <split [227/18]> Slice007 <tibble [6 × 5]> <tibble [0 × 3]>
8 <split [245/21]> Slice008 <tibble [6 × 5]> <tibble [0 × 3]>
9 <split [266/22]> Slice009 <tibble [6 × 5]> <tibble [0 × 3]>
10 <split [288/19]> Slice010 <tibble [6 × 5]> <tibble [0 × 3]>
# ℹ 190 more rows
CV R2:
cv_rsq_en <- en_fit %>%
collect_metrics() %>%
filter(.metric == "rsq") %>%
pull(mean)
cv_rsq_en <- mean(cv_rsq_en) %>% round(2)
print(paste("AR(5) with elastic net cv R2: ", cv_rsq_en))[1] "AR(5) with elastic net cv R2: 0.38"
Test R2:
en_fit %>%
show_best("rsq")# A tibble: 3 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.00000001 rsq standard 0.381 200 0.0149 Preprocessor1_Model1
2 0.0000000316 rsq standard 0.381 200 0.0149 Preprocessor1_Model2
3 0.0000001 rsq standard 0.381 200 0.0149 Preprocessor1_Model3
# Select the penalty with the best CV R^2. Name the `metric` argument
# explicitly (positional metric is deprecated in recent tune releases).
best_en <- en_fit %>%
  select_best(metric = "rsq")
best_en# A tibble: 1 × 2
penalty .config
<dbl> <chr>
1 0.00000001 Preprocessor1_Model1
# Final workflow
final_wf_en <- en_wf %>%
finalize_workflow(best_en)
final_wf_en══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
5 Recipe Steps
• step_dummy()
• step_normalize()
• step_naomit()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)
Main Arguments:
penalty = 1e-08
mixture = 0.5
Computational engine: glmnet
# Fit the whole training set, then predict the test cases
final_fit_en <- fit(final_wf_en, data = NYSE_other)
predictions_en <- predict(final_fit_en, NYSE_test)
predictions_en# A tibble: 1,770 × 1
.pred
<dbl>
1 0.0432
2 0.0936
3 0.104
4 -0.00490
5 0.437
6 0.428
7 0.340
8 0.280
9 0.213
10 0.387
# ℹ 1,760 more rows
#test metrics
results <- bind_cols(NYSE_test %>% select(log_volume), predictions_en)
results# A tibble: 1,770 × 2
log_volume .pred
<dbl> <dbl>
1 0.118 0.0432
2 0.332 0.0936
3 0.0780 0.104
4 0.206 -0.00490
5 0.386 0.437
6 0.582 0.428
7 0.423 0.340
8 0.361 0.280
9 0.358 0.213
10 0.343 0.387
# ℹ 1,760 more rows
test_rsq_en <- rsq(results, truth = log_volume, estimate = .pred) %>%
filter(.metric == "rsq") %>%
pull(.estimate) %>% round(2)
print(paste("AR(5) with elastic net test R2: ", test_rsq_en))[1] "AR(5) with elastic net test R2: 0.55"
1.6 Random forest forecaster (30pts)
Use the same features as in AR(\(L\)) for the random forest. Tune the random forest and evaluate the test performance.
Hint: Workflow: Random Forest for Prediction is a good starting point.
# Preprocessing recipe for the random forest: drop rows with a missing
# outcome and remove zero-variance numeric predictors.
rf_receipe <-
  recipe(log_volume ~ ., data = NYSE_other) %>%
  step_naomit(log_volume) %>%
  step_zv(all_numeric_predictors()) %>%
  # prep()'s argument is `training`, not `data` (which was silently ignored)
  prep(training = NYSE_other, retain = TRUE)
rf_receipe# Model
# Random forest regression via ranger; mtry and trees tuned by CV.
rf_mod <-
rand_forest(
mode = "regression",
# Number of predictors randomly sampled in each split
mtry = tune(),
# Number of trees in ensemble
trees = tune()
) %>%
set_engine("ranger")
rf_modRandom Forest Model Specification (regression)
Main Arguments:
mtry = tune()
trees = tune()
Computational engine: ranger
rf_wf <-
workflow() %>%
add_model(rf_mod) %>%
add_recipe(rf_receipe %>% step_rm(date) %>% step_indicate_na())
rf_wf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
4 Recipe Steps
• step_naomit()
• step_zv()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)
Main Arguments:
mtry = tune()
trees = tune()
Computational engine: ranger
folds <- NYSE_other %>% arrange(date) %>%
sliding_period(date, period = "month", lookback = Inf, assess_stop = 1)
# rolling_origin(initial = 5, assess = 1)
month_folds <- NYSE_other %>%
sliding_period(
date,
"month",
lookback = Inf,
skip = 4)rf_grid <-
grid_regular(
trees(range = c(100L, 300L)),
mtry(range = c(1L, 5L)),
levels = c(3, 5)
)
rf_grid# A tibble: 15 × 2
trees mtry
<int> <int>
1 100 1
2 200 1
3 300 1
4 100 2
5 200 2
6 300 2
7 100 3
8 200 3
9 300 3
10 100 4
11 200 4
12 300 4
13 100 5
14 200 5
15 300 5
rf_fit <- tune_grid(rf_wf, resamples = month_folds, grid = rf_grid)
rf_fit# Tuning results
# Sliding period resampling
# A tibble: 200 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [98/22]> Slice001 <tibble [30 × 6]> <tibble [0 × 3]>
2 <split [120/20]> Slice002 <tibble [30 × 6]> <tibble [0 × 3]>
3 <split [140/22]> Slice003 <tibble [30 × 6]> <tibble [0 × 3]>
4 <split [162/22]> Slice004 <tibble [30 × 6]> <tibble [0 × 3]>
5 <split [184/20]> Slice005 <tibble [30 × 6]> <tibble [0 × 3]>
6 <split [204/23]> Slice006 <tibble [30 × 6]> <tibble [0 × 3]>
7 <split [227/18]> Slice007 <tibble [30 × 6]> <tibble [0 × 3]>
8 <split [245/21]> Slice008 <tibble [30 × 6]> <tibble [0 × 3]>
9 <split [266/22]> Slice009 <tibble [30 × 6]> <tibble [0 × 3]>
10 <split [288/19]> Slice010 <tibble [30 × 6]> <tibble [0 × 3]>
# ℹ 190 more rows
CV R2:
cv_rsq_rf <- rf_fit %>%
collect_metrics() %>%
filter(.metric == "rsq") %>%
pull(mean)
cv_rsq_rf <- mean(cv_rsq_rf) %>% round(2)
print(paste("random forest cv R2: ", cv_rsq_rf))[1] "random forest cv R2: 0.34"
Test R2:
rf_fit %>%
show_best("rsq")# A tibble: 5 × 8
mtry trees .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 5 300 rsq standard 0.346 200 0.0147 Preprocessor1_Model15
2 4 300 rsq standard 0.345 200 0.0148 Preprocessor1_Model12
3 5 200 rsq standard 0.345 200 0.0147 Preprocessor1_Model14
4 4 200 rsq standard 0.345 200 0.0147 Preprocessor1_Model11
5 5 100 rsq standard 0.342 200 0.0148 Preprocessor1_Model13
# Select the best (mtry, trees) by CV R^2. Name the `metric` argument
# explicitly (positional metric is deprecated in recent tune releases).
best_rf <- rf_fit %>%
  select_best(metric = "rsq")
best_rf# A tibble: 1 × 3
mtry trees .config
<int> <int> <chr>
1 5 300 Preprocessor1_Model15
# Final workflow
final_wf_rf <- rf_wf %>%
finalize_workflow(best_rf)
final_wf_rf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
4 Recipe Steps
• step_naomit()
• step_zv()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)
Main Arguments:
mtry = 5
trees = 300
Computational engine: ranger
# Fit the whole training set, then predict the test cases
final_fit_rf <- fit(final_wf_rf, data = NYSE_other)
predictions_rf <- predict(final_fit_rf, NYSE_test)
predictions_rf# A tibble: 1,770 × 1
.pred
<dbl>
1 -0.104
2 -0.0434
3 0.164
4 0.00579
5 0.283
6 0.360
7 0.363
8 0.306
9 0.341
10 0.382
# ℹ 1,760 more rows
#test metrics
results <- bind_cols(NYSE_test %>% select(log_volume), predictions_rf)
results# A tibble: 1,770 × 2
log_volume .pred
<dbl> <dbl>
1 0.118 -0.104
2 0.332 -0.0434
3 0.0780 0.164
4 0.206 0.00579
5 0.386 0.283
6 0.582 0.360
7 0.423 0.363
8 0.361 0.306
9 0.358 0.341
10 0.343 0.382
# ℹ 1,760 more rows
test_rsq_rf <- rsq(results, truth = log_volume, estimate = .pred) %>%
filter(.metric == "rsq") %>%
pull(.estimate) %>% round(2)
print(paste("random forest test R2: ", test_rsq_rf))[1] "random forest test R2: 0.51"
1.7 Boosting forecaster (30pts)
Use the same features as in AR(\(L\)) for the boosting. Tune the boosting algorithm and evaluate the test performance.
Hint: Workflow: Boosting tree for Prediction is a good starting point.
# Preprocessing recipe for boosting: dummy-code nominal predictors (xgboost
# requires numeric input), drop rows with missing outcome/predictors, and
# remove zero-variance numeric predictors.
gb_receipe <-
  recipe(log_volume ~ ., data = NYSE_other) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_naomit(log_volume) %>%
  step_naomit(all_predictors()) %>%
  step_zv(all_numeric_predictors()) %>%
  # prep()'s argument is `training`, not `data` (which was silently ignored)
  prep(training = NYSE_other, retain = TRUE)
gb_receipe# Model
# Gradient-boosted trees via xgboost; tree_depth and learn_rate tuned by CV.
gb_mod <-
boost_tree(
mode = "regression",
trees = 1000,
tree_depth = tune(),
learn_rate = tune()
) %>%
set_engine("xgboost")
gb_modBoosted Tree Model Specification (regression)
Main Arguments:
trees = 1000
tree_depth = tune()
learn_rate = tune()
Computational engine: xgboost
gb_wf <-
workflow() %>%
add_model(gb_mod) %>%
add_recipe(gb_receipe %>% step_rm(date) %>% step_indicate_na())
gb_wf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
6 Recipe Steps
• step_dummy()
• step_naomit()
• step_naomit()
• step_zv()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (regression)
Main Arguments:
trees = 1000
tree_depth = tune()
learn_rate = tune()
Computational engine: xgboost
folds <- NYSE_other %>% arrange(date) %>%
sliding_period(date, period = "month", lookback = Inf, assess_stop = 1)
# rolling_origin(initial = 5, assess = 1)
month_folds <- NYSE_other %>%
sliding_period(
date,
"month",
lookback = Inf,
skip = 4)gb_grid <-
grid_regular(
tree_depth(range = c(1L, 4L)),
learn_rate(range = c(-3, -0.5), trans = log10_trans()),
levels = c(4, 10)
)
gb_grid# A tibble: 40 × 2
tree_depth learn_rate
<int> <dbl>
1 1 0.001
2 2 0.001
3 3 0.001
4 4 0.001
5 1 0.00190
6 2 0.00190
7 3 0.00190
8 4 0.00190
9 1 0.00359
10 2 0.00359
# ℹ 30 more rows
Check the type of variables in the dataset.
dataset <- NYSE_other
# Report each column's data type: numeric, categorical (factor), or other.
for (col_name in names(dataset)) {
  column <- dataset[[col_name]]
  cat("Variable:", col_name, "\n")
  if (is.numeric(column)) {
    cat(" Data Type: Numeric\n")
  } else if (is.factor(column)) {
    cat(" Data Type: Categorical\n")
  } else {
    cat(" Data Type: Other\n")
  }
}
Variable: date
Data Type: Other
Variable: day_of_week
Data Type: Other
Variable: DJ_return
Data Type: Numeric
Variable: log_volume
Data Type: Numeric
Variable: log_volatility
Data Type: Numeric
Variable: DJ_return_lag1
Data Type: Numeric
Variable: log_volume_lag1
Data Type: Numeric
Variable: log_volatility_lag1
Data Type: Numeric
Variable: DJ_return_lag2
Data Type: Numeric
Variable: log_volume_lag2
Data Type: Numeric
Variable: log_volatility_lag2
Data Type: Numeric
Variable: DJ_return_lag3
Data Type: Numeric
Variable: log_volume_lag3
Data Type: Numeric
Variable: log_volatility_lag3
Data Type: Numeric
Variable: DJ_return_lag4
Data Type: Numeric
Variable: log_volume_lag4
Data Type: Numeric
Variable: log_volatility_lag4
Data Type: Numeric
Variable: DJ_return_lag5
Data Type: Numeric
Variable: log_volume_lag5
Data Type: Numeric
Variable: log_volatility_lag5
Data Type: Numeric
gb_fit <- tune_grid(gb_wf, resamples = month_folds, grid = gb_grid)
gb_fit# Tuning results
# Sliding period resampling
# A tibble: 200 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [98/22]> Slice001 <tibble [80 × 6]> <tibble [0 × 3]>
2 <split [120/20]> Slice002 <tibble [80 × 6]> <tibble [0 × 3]>
3 <split [140/22]> Slice003 <tibble [80 × 6]> <tibble [0 × 3]>
4 <split [162/22]> Slice004 <tibble [80 × 6]> <tibble [0 × 3]>
5 <split [184/20]> Slice005 <tibble [80 × 6]> <tibble [0 × 3]>
6 <split [204/23]> Slice006 <tibble [80 × 6]> <tibble [0 × 3]>
7 <split [227/18]> Slice007 <tibble [80 × 6]> <tibble [0 × 3]>
8 <split [245/21]> Slice008 <tibble [80 × 6]> <tibble [0 × 3]>
9 <split [266/22]> Slice009 <tibble [80 × 6]> <tibble [0 × 3]>
10 <split [288/19]> Slice010 <tibble [80 × 6]> <tibble [0 × 3]>
# ℹ 190 more rows
CV R2:
cv_rsq_gb <- gb_fit %>%
collect_metrics() %>%
filter(.metric == "rsq") %>%
pull(mean)
cv_rsq_gb <- mean(cv_rsq_gb) %>% round(2)
print(paste("boosting cv R2: ", cv_rsq_gb))[1] "boosting cv R2: 0.34"
Test R2:
gb_fit %>%
show_best("rsq")# A tibble: 5 × 8
tree_depth learn_rate .metric .estimator mean n std_err .config
<int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 3 0.0129 rsq standard 0.389 200 0.0157 Preprocessor1_Mo…
2 3 0.0245 rsq standard 0.389 200 0.0157 Preprocessor1_Mo…
3 2 0.0464 rsq standard 0.387 200 0.0156 Preprocessor1_Mo…
4 2 0.0245 rsq standard 0.387 200 0.0155 Preprocessor1_Mo…
5 4 0.0129 rsq standard 0.387 200 0.0155 Preprocessor1_Mo…
# Select the best (tree_depth, learn_rate) by CV R^2. Name the `metric`
# argument explicitly (positional metric is deprecated in tune).
best_gb <- gb_fit %>%
  select_best(metric = "rsq")
best_gb# A tibble: 1 × 3
tree_depth learn_rate .config
<int> <dbl> <chr>
1 3 0.0129 Preprocessor1_Model19
# Final workflow
final_wf_gb <- gb_wf %>%
finalize_workflow(best_gb)
final_wf_gb══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
6 Recipe Steps
• step_dummy()
• step_naomit()
• step_naomit()
• step_zv()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (regression)
Main Arguments:
trees = 1000
tree_depth = 3
learn_rate = 0.0129154966501488
Computational engine: xgboost
# Fit the whole training set, then predict the test cases
final_fit_gb <- fit(final_wf_gb, data = NYSE_other)
predictions_gb <- predict(final_fit_gb, NYSE_test)
predictions_gb# A tibble: 1,770 × 1
.pred
<dbl>
1 -0.124
2 -0.0340
3 0.133
4 -0.0575
5 0.475
6 0.471
7 0.315
8 0.385
9 0.313
10 0.470
# ℹ 1,760 more rows
#test metrics
results <- bind_cols(NYSE_test %>% select(log_volume), predictions_gb)
results# A tibble: 1,770 × 2
log_volume .pred
<dbl> <dbl>
1 0.118 -0.124
2 0.332 -0.0340
3 0.0780 0.133
4 0.206 -0.0575
5 0.386 0.475
6 0.582 0.471
7 0.423 0.315
8 0.361 0.385
9 0.358 0.313
10 0.343 0.470
# ℹ 1,760 more rows
test_rsq_gb <- rsq(results, truth = log_volume, estimate = .pred) %>%
filter(.metric == "rsq") %>%
pull(.estimate) %>% round(2)
print(paste("boosting test R2: ", test_rsq_gb))[1] "boosting test R2: 0.54"
1.8 Summary (30pts)
Your score for this question is largely determined by your final test performance.
Summarize the performance of different machine learning forecasters in the following format.
| Method | CV \(R^2\) | Test \(R^2\) | |
|---|---|---|---|
| Baseline | |||
| AR(5) | |||
| Random Forest | |||
| Boosting |
tibble(
Method = c("Baseline", "AR(5)", "Random Forest", "Boosting"),
`CV R2` = c(NA, cv_rsq_en, cv_rsq_rf, cv_rsq_gb),
`Test R2` = c(r2_test_strawman, test_rsq_en, test_rsq_rf, test_rsq_gb)
)# A tibble: 4 × 3
Method `CV R2` `Test R2`
<chr> <dbl> <dbl>
1 Baseline NA 0.35
2 AR(5) 0.38 0.55
3 Random Forest 0.34 0.51
4 Boosting 0.34 0.54
2 ISL Exercise 12.6.13 (90 pts)
2.1 12.6.13 (b) (30 pts)
On the book website, www.statlearning.com, there is a gene expression data set (Ch12Ex13.csv) that consists of 40 tissue samples with measurements on 1,000 genes. The first 20 samples are from healthy patients, while the second 20 are from a diseased group.
Apply hierarchical clustering to the samples using correlationbased distance, and plot the dendrogram. Do the genes separate the samples into the two groups? Do your results depend on the type of linkage used?
Ch12Ex13 <- read_csv("https://www.statlearning.com/s/Ch12Ex13.csv", col_names = paste("ID", 1:40, sep = ""))Average method:
set.seed(838383)
hc_spec <- hier_clust(
# num_clusters = 3,
linkage_method = "average"
)
hc_fit <- hc_spec %>%
fit(~ .,
data = as.data.frame(t(Ch12Ex13))
)
hc_fit %>%
summary() Length Class Mode
spec 4 hier_clust list
fit 7 hclust list
elapsed 1 -none- list
preproc 4 -none- list
hc_fit$fit %>% plot()Complete method:
set.seed(838383)
hc_spec <- hier_clust(
# num_clusters = 3,
linkage_method = "complete"
)
hc_fit <- hc_spec %>%
fit(~ .,
data = as.data.frame(t(Ch12Ex13))
)
hc_fit %>%
summary() Length Class Mode
spec 4 hier_clust list
fit 7 hclust list
elapsed 1 -none- list
preproc 4 -none- list
hc_fit$fit %>% plot()Single method:
set.seed(838383)
hc_spec <- hier_clust(
# num_clusters = 3,
linkage_method = "single"
)
hc_fit <- hc_spec %>%
fit(~ .,
data = as.data.frame(t(Ch12Ex13))
)
hc_fit %>%
summary() Length Class Mode
spec 4 hier_clust list
fit 7 hclust list
elapsed 1 -none- list
preproc 4 -none- list
hc_fit$fit %>% plot()Centroid method:
set.seed(838383)
hc_spec <- hier_clust(
# num_clusters = 3,
linkage_method = "centroid"
)
hc_fit <- hc_spec %>%
fit(~ .,
data = as.data.frame(t(Ch12Ex13))
)
hc_fit %>%
summary() Length Class Mode
spec 4 hier_clust list
fit 7 hclust list
elapsed 1 -none- list
preproc 4 -none- list
hc_fit$fit %>% plot()The genes separate the samples into the two groups. And my results do not depend on the type of linkage used since all four linkage methods used suggest two main clusters.
2.2 PCA and UMAP (30 pts)
Load necessary libraries.
library(recipes)
library(tidymodels)
library(tidyverse)
library(embed)Transpose the data.
transposed_data <- Ch12Ex13 %>%
t() %>%
as.data.frame() %>%
rownames_to_column() %>%
as_tibble() %>%
rename(ID = rowname)
transposed_data# A tibble: 40 × 1,001
ID V1 V2 V3 V4 V5 V6 V7 V8 V9
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ID1 -0.962 -0.293 0.259 -1.15 0.196 0.0301 0.0854 1.12 -1.22
2 ID2 0.442 -1.14 -0.973 -2.21 0.593 -0.691 -1.11 1.34 -1.28
3 ID3 -0.975 0.196 0.588 -0.862 0.283 -0.403 -0.678 0.103 -0.559
4 ID4 1.42 -1.28 -0.800 0.631 0.247 -0.730 -0.563 0.391 -1.34
5 ID5 0.819 -0.251 -1.82 0.952 1.98 -0.364 0.938 -1.93 1.16
6 ID6 0.316 2.51 -2.06 -1.17 -0.871 1.13 0.119 0.452 -1.50
7 ID7 -0.0250 -0.922 -0.0648 -0.392 -0.990 -1.40 -2.19 -1.35 -0.554
8 ID8 -0.0640 0.0595 1.59 1.06 -1.03 -0.806 0.685 0.625 0.691
9 ID9 0.0315 -1.41 -0.173 -0.350 -1.11 -1.24 0.262 0.816 -0.882
10 ID10 -0.350 -0.657 -0.121 -1.49 -0.385 0.578 -1.23 -0.358 0.454
# ℹ 30 more rows
# ℹ 991 more variables: V10 <dbl>, V11 <dbl>, V12 <dbl>, V13 <dbl>, V14 <dbl>,
# V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>,
# V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>,
# V27 <dbl>, V28 <dbl>, V29 <dbl>, V30 <dbl>, V31 <dbl>, V32 <dbl>,
# V33 <dbl>, V34 <dbl>, V35 <dbl>, V36 <dbl>, V37 <dbl>, V38 <dbl>,
# V39 <dbl>, V40 <dbl>, V41 <dbl>, V42 <dbl>, V43 <dbl>, V44 <dbl>, …
names(transposed_data) <- c("ID", paste(seq(1, 1000), ""))
# names(transposed_data)
transposed_data <- transposed_data %>%
mutate(status = as.character(rep(c("Healthy", "Disease"), each = 20)),
.after = "ID")
transposed_data# A tibble: 40 × 1,002
ID status `1 ` `2 ` `3 ` `4 ` `5 ` `6 ` `7 ` `8 `
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ID1 Healthy -0.962 -0.293 0.259 -1.15 0.196 0.0301 0.0854 1.12
2 ID2 Healthy 0.442 -1.14 -0.973 -2.21 0.593 -0.691 -1.11 1.34
3 ID3 Healthy -0.975 0.196 0.588 -0.862 0.283 -0.403 -0.678 0.103
4 ID4 Healthy 1.42 -1.28 -0.800 0.631 0.247 -0.730 -0.563 0.391
5 ID5 Healthy 0.819 -0.251 -1.82 0.952 1.98 -0.364 0.938 -1.93
6 ID6 Healthy 0.316 2.51 -2.06 -1.17 -0.871 1.13 0.119 0.452
7 ID7 Healthy -0.0250 -0.922 -0.0648 -0.392 -0.990 -1.40 -2.19 -1.35
8 ID8 Healthy -0.0640 0.0595 1.59 1.06 -1.03 -0.806 0.685 0.625
9 ID9 Healthy 0.0315 -1.41 -0.173 -0.350 -1.11 -1.24 0.262 0.816
10 ID10 Healthy -0.350 -0.657 -0.121 -1.49 -0.385 0.578 -1.23 -0.358
# ℹ 30 more rows
# ℹ 992 more variables: `9 ` <dbl>, `10 ` <dbl>, `11 ` <dbl>, `12 ` <dbl>,
# `13 ` <dbl>, `14 ` <dbl>, `15 ` <dbl>, `16 ` <dbl>, `17 ` <dbl>,
# `18 ` <dbl>, `19 ` <dbl>, `20 ` <dbl>, `21 ` <dbl>, `22 ` <dbl>,
# `23 ` <dbl>, `24 ` <dbl>, `25 ` <dbl>, `26 ` <dbl>, `27 ` <dbl>,
# `28 ` <dbl>, `29 ` <dbl>, `30 ` <dbl>, `31 ` <dbl>, `32 ` <dbl>,
# `33 ` <dbl>, `34 ` <dbl>, `35 ` <dbl>, `36 ` <dbl>, `37 ` <dbl>, …
# tail(transposed_data)Create the recipe for PCA.
pca_rec <- recipe(~., data = transposed_data) %>%
update_role(ID, status, new_role = "ID") %>%
step_normalize(all_predictors()) %>%
step_zv(all_numeric()) %>%
step_pca(all_predictors())
pca_prep <- prep(pca_rec)
pca_prepPlot the PCA.
juice(pca_prep) %>%
ggplot(aes(x = PC1, y = PC2, label = ID)) +
geom_point(aes(color = status), alpha = 0.7, size = 2) +
labs(color = "Status")Create the recipe for UMAP.
umap_rec <- recipe(~., data = transposed_data) %>%
update_role(ID, status, new_role = "ID") %>%
step_normalize(all_predictors()) %>%
step_umap(all_predictors(), neighbors = 20, min_dist = 0.01)
umap_prep <- prep(umap_rec)Plot the UMAP.
juice(umap_prep) %>%
ggplot(aes(x = UMAP1, y = UMAP2, label = ID)) +
geom_point(aes(color = status), alpha = 0.7, size = 2) +
labs(color = "Status")2.3 12.6.13 (c) (30 pts)
Your collaborator wants to know which genes differ the most across the two groups. Suggest a way to answer this question, and apply it here.
# Group indicator: 1 for the first 20 samples, 0 for the last 20.
grp <- factor(rep(c(1, 0), each = 20))
# Per-gene two-group comparison via OLS: regress a gene's expression vector
# on the group factor and return the p-value of the group coefficient.
regression <- function(y) {
  # renamed from `sum`, which shadowed base::sum
  fit_summary <- summary(lm(y ~ grp))
  fit_summary$coefficients[2, 4]
}
out <- tibble(
gene = seq(1, nrow(Ch12Ex13)),
p_values = unlist(purrr::map(1:nrow(Ch12Ex13),
~ regression(as.matrix(Ch12Ex13)[.x, ])))
)out %>%
arrange(p_values) %>%
head(10)# A tibble: 10 × 2
gene p_values
<int> <dbl>
1 502 1.43e-12
2 589 3.19e-12
3 600 9.81e-12
4 590 6.28e-11
5 565 1.74e-10
6 551 1.10e- 9
7 593 1.19e- 9
8 584 1.65e- 9
9 538 2.42e- 9
10 569 2.69e- 9
# sig <- out %>% arrange(p_values) %>% filter(p_values < 0.05/nrow(Ch12Ex13))
sig <- out %>%
arrange(p_values) %>%
filter(p_values < 0.05)library(pheatmap)
library(ggplotify) ## to convert pheatmap to ggplot2
library(heatmaply) ## for constructing interactive heatmap# create data frame for annotations
# Column annotations for the heatmap. Per the problem statement, the FIRST
# 20 samples are healthy and the LAST 20 diseased (matching the `status`
# labels used for transposed_data); the original labeling was inverted.
dfh <- data.frame(sample = as.character(colnames(Ch12Ex13)),
                  status = "healthy") %>%
  column_to_rownames("sample")
dfh$status[seq(21, 40)] <- "disease"
dfh status
ID1 disease
ID2 disease
ID3 disease
ID4 disease
ID5 disease
ID6 disease
ID7 disease
ID8 disease
ID9 disease
ID10 disease
ID11 disease
ID12 disease
ID13 disease
ID14 disease
ID15 disease
ID16 disease
ID17 disease
ID18 disease
ID19 disease
ID20 disease
ID21 healthy
ID22 healthy
ID23 healthy
ID24 healthy
ID25 healthy
ID26 healthy
ID27 healthy
ID28 healthy
ID29 healthy
ID30 healthy
ID31 healthy
ID32 healthy
ID33 healthy
ID34 healthy
ID35 healthy
ID36 healthy
ID37 healthy
ID38 healthy
ID39 healthy
ID40 healthy
pheatmap(Ch12Ex13[sig$gene, ],
cluster_rows = FALSE,
cluster_cols = T,
scale = "row",
annotation_col = dfh,
annotation_colors = list(status = c(disease = "orange", healthy = "black")),
color = colorRampPalette(c("navy", "white", "red"))(50)
)Based on p value table created above, we can see that the top 10 genes that differ the most across the two groups are 502, 589, 600, 590, 565, 551, 593, 584, 538, and 569.